import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
import funciones_auxiliares #Para poder llamar a una serie de funciones ya escritas en un archivo py
Primero cargamos los datos en un DataFrame
# Load the raw data and drop identifier-like columns with no predictive value.
# low_memory=False reads the file in one pass, fixing the DtypeWarning about
# mixed-type columns (the offending column 0 is 'id', dropped anyway).
pd_loan = pd.read_csv('pd_data_initial_preprocessing.csv',
                      low_memory=False).drop(['id', 'emp_title', 'url', 'zip_code'],
                                             axis=1)
pd_loan
C:\Users\lolo\AppData\Local\Temp/ipykernel_8696/2381297441.py:1: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
pd_loan=pd.read_csv('pd_data_initial_preprocessing.csv').drop(["id",'emp_title', "url", "zip_code"],axis=1)
| mths_since_last_record | il_util | mths_since_recent_bc_dlq | mths_since_rcnt_il | all_util | inq_fi | max_bal_bc | total_cu_tl | open_acc_6m | open_act_il | ... | initial_list_status | application_type | pub_rec | pub_rec_bankruptcies | loan_amnt | mort_acc | mo_sin_old_rev_tl_op | open_acc | earliest_cr_line_month | earliest_cr_line_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | 55.0 | 1.0 | 2315.0 | 0.0 | 0.0 | 0.0 | ... | w | Individual | 0.0 | 0.0 | 2300.0 | 0.0 | 381.0 | 4.0 | 9.0 | 1985.0 |
| 1 | NaN | 69.0 | NaN | 26.0 | 94.0 | 0.0 | 1962.0 | 6.0 | 2.0 | 1.0 | ... | w | Individual | 0.0 | 0.0 | 16000.0 | 7.0 | 300.0 | 9.0 | 6.0 | 1992.0 |
| 2 | NaN | NaN | NaN | 10.0 | 45.0 | 1.0 | 1950.0 | 5.0 | 1.0 | 0.0 | ... | w | Individual | 0.0 | 0.0 | 6025.0 | 2.0 | 180.0 | 11.0 | 6.0 | 2002.0 |
| 3 | NaN | 95.0 | NaN | 6.0 | 60.0 | 1.0 | 4240.0 | 1.0 | 1.0 | 10.0 | ... | w | Individual | 0.0 | 0.0 | 20400.0 | 0.0 | 125.0 | 15.0 | 1.0 | 2007.0 |
| 4 | NaN | 72.0 | 26.0 | 127.0 | 78.0 | 0.0 | 2996.0 | 0.0 | 0.0 | 1.0 | ... | w | Individual | 0.0 | 0.0 | 13000.0 | 1.0 | 184.0 | 5.0 | 2.0 | 2002.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 884879 | 84.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | w | Individual | 1.0 | 1.0 | 13000.0 | 0.0 | 108.0 | 9.0 | 9.0 | 2003.0 |
| 884880 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | w | Individual | 0.0 | 0.0 | 12000.0 | 0.0 | 83.0 | 17.0 | 10.0 | 2003.0 |
| 884881 | 110.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | f | Individual | 1.0 | 0.0 | 20000.0 | 0.0 | 155.0 | 8.0 | 12.0 | 2001.0 |
| 884882 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 884883 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
884884 rows × 47 columns
pd_loan.drop( pd_loan.tail(2).index,inplace=True)
Observamos que los dos últimos registros contienen datos muy extraños, así que los eliminamos. En caso de ser necesario, volveríamos aquí para revisarlos.
pd_loan.head(5).T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| mths_since_last_record | NaN | NaN | NaN | NaN | NaN |
| il_util | NaN | 69.0 | NaN | 95.0 | 72.0 |
| mths_since_recent_bc_dlq | NaN | NaN | NaN | NaN | 26.0 |
| mths_since_rcnt_il | NaN | 26.0 | 10.0 | 6.0 | 127.0 |
| all_util | 55.0 | 94.0 | 45.0 | 60.0 | 78.0 |
| inq_fi | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| max_bal_bc | 2315.0 | 1962.0 | 1950.0 | 4240.0 | 2996.0 |
| total_cu_tl | 0.0 | 6.0 | 5.0 | 1.0 | 0.0 |
| open_acc_6m | 0.0 | 2.0 | 1.0 | 1.0 | 0.0 |
| open_act_il | 0.0 | 1.0 | 0.0 | 10.0 | 1.0 |
| open_il_12m | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| total_bal_il | 0.0 | 11078.0 | 0.0 | 53566.0 | 8466.0 |
| open_rv_12m | 1.0 | 3.0 | 1.0 | 1.0 | 0.0 |
| open_rv_24m | 2.0 | 5.0 | 2.0 | 2.0 | 1.0 |
| open_il_24m | 0.0 | 0.0 | 2.0 | 1.0 | 0.0 |
| inq_last_12m | 2.0 | 1.0 | 3.0 | 1.0 | 0.0 |
| mths_since_last_major_derog | NaN | NaN | NaN | NaN | 16.0 |
| mths_since_recent_revol_delinq | NaN | NaN | NaN | NaN | 16.0 |
| mths_since_last_delinq | NaN | NaN | NaN | NaN | 16.0 |
| emp_length | NaN | 10.0 | 7.0 | 10.0 | 10.0 |
| mo_sin_old_il_acct | NaN | 26.0 | 86.0 | 125.0 | 160.0 |
| revol_util | 55.1 | 105.8 | 44.9 | 18.7 | 88.0 |
| dti | 21.61 | 25.61 | 8.88 | 27.06 | 6.79 |
| verification_status | Not Verified | Not Verified | Not Verified | Source Verified | Source Verified |
| annual_inc | 10000.0 | 94000.0 | 46350.0 | 44000.0 | 85000.0 |
| home_ownership | OWN | MORTGAGE | MORTGAGE | RENT | MORTGAGE |
| sub_grade | 1.0 | 1.0 | 4.0 | 1.0 | 5.0 |
| grade | C | C | C | B | B |
| term | 36.0 | 60.0 | 36.0 | 36.0 | 36.0 |
| int_rate | 12.62 | 12.62 | 15.05 | 9.44 | 11.99 |
| installment | 77.08 | 360.95 | 209.01 | 652.91 | 431.73 |
| loan_status | Current | Current | Current | Current | Current |
| total_acc | 6.0 | 26.0 | 27.0 | 19.0 | 24.0 |
| purpose | credit_card | debt_consolidation | home_improvement | car | debt_consolidation |
| addr_state | NY | MA | MA | CA | MN |
| fico_range_low | 700.0 | 715.0 | 700.0 | 760.0 | 660.0 |
| fico_range_high | 704.0 | 719.0 | 704.0 | 764.0 | 664.0 |
| initial_list_status | w | w | w | w | w |
| application_type | Individual | Individual | Individual | Individual | Individual |
| pub_rec | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| pub_rec_bankruptcies | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| loan_amnt | 2300.0 | 16000.0 | 6025.0 | 20400.0 | 13000.0 |
| mort_acc | 0.0 | 7.0 | 2.0 | 0.0 | 1.0 |
| mo_sin_old_rev_tl_op | 381.0 | 300.0 | 180.0 | 125.0 | 184.0 |
| open_acc | 4.0 | 9.0 | 11.0 | 15.0 | 5.0 |
| earliest_cr_line_month | 9.0 | 6.0 | 6.0 | 1.0 | 2.0 |
| earliest_cr_line_year | 1985.0 | 1992.0 | 2002.0 | 2007.0 | 2002.0 |
Vemos la información de la variable objetivo
# Distribution of the target variable: absolute counts and percentages.
status_counts = pd_loan['loan_status'].value_counts()
pd_plot_loan_status_count = status_counts.reset_index()
pd_plot_loan_status_percent = (status_counts / status_counts.sum()).mul(100)\
    .rename('percent').reset_index()
# Combine both views in a single table (joined on the shared 'index' column).
pd_plot_loan_status = pd_plot_loan_status_count.merge(pd_plot_loan_status_percent)
pd_plot_loan_status
| index | loan_status | percent | |
|---|---|---|---|
| 0 | Current | 422685 | 47.767710 |
| 1 | Fully Paid | 345520 | 39.047279 |
| 2 | Charged Off | 97047 | 10.967299 |
| 3 | Late (31-120 days) | 11168 | 1.262098 |
| 4 | In Grace Period | 5507 | 0.622347 |
| 5 | Late (16-30 days) | 2915 | 0.329425 |
| 6 | Default | 34 | 0.003842 |
Antes de analizar los datos, dado que nos vamos a centrar en predecir si se va a pagar, no nos interesan los datos de los clientes que aún no han terminado de pagar: un cliente que se ha retrasado un mes en el pago puede que llegue a saldar su deuda. Por ello nos vamos a restringir a los casos ya cerrados.
Primero necesitamos crear un DataFrame con solo dos categorías en loan_status: Fully Paid y Charged Off.
# Keep only loans with a final outcome: Fully Paid or Charged Off.
pd_loan = pd_loan[pd_loan['loan_status'].isin(['Fully Paid', 'Charged Off'])]
Vemos la forma de los datos
pd_loan.shape
(442567, 47)
Analizamos cual es el porcentaje de valores nulos:
# Fraction of missing values per column (isnull is an alias of isna).
missing = pd_loan.isnull().mean()
missing
mths_since_last_record 0.812372 il_util 0.951273 mths_since_recent_bc_dlq 0.738062 mths_since_rcnt_il 0.945267 all_util 0.944006 inq_fi 0.943999 max_bal_bc 0.943999 total_cu_tl 0.943999 open_acc_6m 0.943999 open_act_il 0.943999 open_il_12m 0.943999 total_bal_il 0.943999 open_rv_12m 0.943999 open_rv_24m 0.943999 open_il_24m 0.943999 inq_last_12m 0.943999 mths_since_last_major_derog 0.706553 mths_since_recent_revol_delinq 0.640341 mths_since_last_delinq 0.487339 emp_length 0.053944 mo_sin_old_il_acct 0.028651 revol_util 0.000484 dti 0.000041 verification_status 0.000000 annual_inc 0.000000 home_ownership 0.000000 sub_grade 0.000000 grade 0.000000 term 0.000000 int_rate 0.000000 installment 0.000000 loan_status 0.000000 total_acc 0.000000 purpose 0.000000 addr_state 0.000000 fico_range_low 0.000000 fico_range_high 0.000000 initial_list_status 0.000000 application_type 0.000000 pub_rec 0.000000 pub_rec_bankruptcies 0.000000 loan_amnt 0.000000 mort_acc 0.000000 mo_sin_old_rev_tl_op 0.000000 open_acc 0.000000 earliest_cr_line_month 0.000000 earliest_cr_line_year 0.000000 dtype: float64
Hay una gran cantidad de variables con una gran cantidad de missings tanto en en los pagados como en los charged off, imputarlos podría suponer un problema, veremos en el punto dos cómo son las variables para ver que podemos hacer con ellas.
Veamos de qué tipos son las variables a estudiar
pd_loan.dtypes
mths_since_last_record float64 il_util float64 mths_since_recent_bc_dlq float64 mths_since_rcnt_il float64 all_util float64 inq_fi float64 max_bal_bc float64 total_cu_tl float64 open_acc_6m float64 open_act_il float64 open_il_12m float64 total_bal_il float64 open_rv_12m float64 open_rv_24m float64 open_il_24m float64 inq_last_12m float64 mths_since_last_major_derog float64 mths_since_recent_revol_delinq float64 mths_since_last_delinq float64 emp_length float64 mo_sin_old_il_acct float64 revol_util float64 dti float64 verification_status object annual_inc float64 home_ownership object sub_grade float64 grade object term float64 int_rate float64 installment float64 loan_status object total_acc float64 purpose object addr_state object fico_range_low float64 fico_range_high float64 initial_list_status object application_type object pub_rec float64 pub_rec_bankruptcies float64 loan_amnt float64 mort_acc float64 mo_sin_old_rev_tl_op float64 open_acc float64 earliest_cr_line_month float64 earliest_cr_line_year float64 dtype: object
Sabemos que hay variables enteras que no se han importado como tal por tener nan.
Creamos tres listas unas con las variables continuas , otra con las variables categoricas y otra con la variables enteras
# Split the columns into continuous (float), categorical (object) and
# integer-valued groups.  A float column whose values are all whole numbers
# (NaN is skipped by .sum()) is really an integer column upcast on import.
list_var_continuous = pd_loan.select_dtypes('float').columns.tolist()
list_var_discrets = pd_loan.select_dtypes('O').columns.tolist()
list_var_int = [c for c in list_var_continuous if (pd_loan[c] % 1).sum() == 0]
list_var_float = [c for c in list_var_continuous if (pd_loan[c] % 1).sum() != 0]
list_var_int
['mths_since_last_record', 'il_util', 'mths_since_recent_bc_dlq', 'mths_since_rcnt_il', 'all_util', 'inq_fi', 'max_bal_bc', 'total_cu_tl', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'total_bal_il', 'open_rv_12m', 'open_rv_24m', 'open_il_24m', 'inq_last_12m', 'mths_since_last_major_derog', 'mths_since_recent_revol_delinq', 'mths_since_last_delinq', 'emp_length', 'mo_sin_old_il_acct', 'sub_grade', 'term', 'total_acc', 'fico_range_low', 'fico_range_high', 'pub_rec', 'pub_rec_bankruptcies', 'loan_amnt', 'mort_acc', 'mo_sin_old_rev_tl_op', 'open_acc', 'earliest_cr_line_month', 'earliest_cr_line_year']
pd_loan[list_var_float]
| revol_util | dti | annual_inc | int_rate | installment | |
|---|---|---|---|---|---|
| 8 | 22.7 | 16.99 | 153000.0 | 7.35 | 359.26 |
| 10 | 98.9 | 6.07 | 50000.0 | 24.85 | 285.70 |
| 24 | 27.2 | 13.12 | 110000.0 | 7.35 | 232.79 |
| 42 | 20.0 | 10.11 | 51979.0 | 16.02 | 243.29 |
| 91 | 90.0 | 10.86 | 75000.0 | 16.02 | 492.34 |
| ... | ... | ... | ... | ... | ... |
| 884875 | 65.0 | 27.08 | 25400.0 | 15.59 | 404.61 |
| 884876 | 69.7 | 23.69 | 63000.0 | 11.99 | 398.52 |
| 884879 | 61.3 | 30.90 | 35000.0 | 15.99 | 316.07 |
| 884880 | 30.6 | 27.19 | 64400.0 | 19.99 | 317.86 |
| 884881 | 79.8 | 10.83 | 100000.0 | 11.99 | 664.20 |
442567 rows × 5 columns
Realizamos un histograma y comparamos datos para comprobar cómo de desbalanceados están los datos que vamos a analizar.
Para el conteo absoluto:
pd_plot_loan_status_count=pd_loan['loan_status'].value_counts().reset_index()
Para el porcentaje:
# Percentage per loan_status.
pd_plot_loan_status_percent=pd_loan['loan_status'].value_counts(normalize=True)\
                                                  .mul(100).rename('percent').reset_index()
Los unimos:
pd_plot_loan_status=pd.merge(pd_plot_loan_status_count,pd_plot_loan_status_percent)
Hacemos el histograma.
# Bar chart of the class percentages (px.histogram sums the pre-computed
# 'percent' values per category).
fig = px.histogram(pd_plot_loan_status, x="index", y=['percent'])
fig.show()
pd_plot_loan_status_count
| index | loan_status | |
|---|---|---|
| 0 | Fully Paid | 345520 |
| 1 | Charged Off | 97047 |
Observamos que hay una gran diferencia de tamaños; una forma de solventarlo, dado que tenemos muchos datos, es eliminar de forma aleatoria 345520-97047 registros de las personas que saldaron la deuda (undersampling).
Parece que están bastante desbalanceados; haremos un balanceo en los puntos siguientes.
Para ello deberemos hacer uso de las funciones auxiliares que nos ayudarán a calcular la matriz de correlaciones de las variables continuas.
Importamos todos los datos con los posibles estados
# Pearson correlation matrix of the continuous variables, drawn by the
# project helper module (defined outside this file).
funciones_auxiliares.get_corr_matrix(dataset = pd_loan[list_var_continuous],
                                     metodo='pearson', size_figure=[10,8])
0
Eliminaremos variables que tienen alta correlación con otras que si dejaremos:
pd_loan.drop(['mo_sin_old_rev_tl_op','fico_range_low','open_rv_12m','open_il_12m','earliest_cr_line_year'],1)
C:\Users\lolo\AppData\Local\Temp/ipykernel_8696/1557204909.py:1: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.
| mths_since_last_record | il_util | mths_since_recent_bc_dlq | mths_since_rcnt_il | all_util | inq_fi | max_bal_bc | total_cu_tl | open_acc_6m | open_act_il | ... | addr_state | fico_range_high | initial_list_status | application_type | pub_rec | pub_rec_bankruptcies | loan_amnt | mort_acc | open_acc | earliest_cr_line_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | 84.0 | 63.0 | NaN | 27.0 | 36.0 | 1.0 | 1581.0 | 6.0 | 1.0 | 1.0 | ... | CA | 724.0 | w | Individual | 1.0 | 1.0 | 11575.0 | 2.0 | 20.0 | 7.0 |
| 10 | NaN | 51.0 | NaN | 21.0 | 74.0 | 0.0 | 2779.0 | 0.0 | 0.0 | 1.0 | ... | WA | 689.0 | w | Individual | 0.0 | 0.0 | 7200.0 | 0.0 | 4.0 | 1.0 |
| 24 | NaN | 70.0 | NaN | 7.0 | 39.0 | 6.0 | 5965.0 | 8.0 | 1.0 | 4.0 | ... | TX | 714.0 | w | Individual | 0.0 | 0.0 | 7500.0 | 4.0 | 19.0 | 3.0 |
| 42 | 55.0 | 95.0 | NaN | 9.0 | 32.0 | 0.0 | 3898.0 | 0.0 | 1.0 | 1.0 | ... | CA | 694.0 | w | Individual | 2.0 | 2.0 | 10000.0 | 0.0 | 15.0 | 8.0 |
| 91 | NaN | 82.0 | NaN | 7.0 | 83.0 | 3.0 | 2700.0 | 0.0 | 1.0 | 2.0 | ... | MA | 689.0 | w | Individual | 0.0 | 0.0 | 14000.0 | 1.0 | 4.0 | 5.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 884875 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | TX | 684.0 | f | Individual | 0.0 | 0.0 | 11575.0 | 0.0 | 9.0 | 7.0 |
| 884876 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | OH | 689.0 | f | Individual | 0.0 | 0.0 | 12000.0 | 1.0 | 13.0 | 6.0 |
| 884879 | 84.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | TN | 684.0 | w | Individual | 1.0 | 1.0 | 13000.0 | 0.0 | 9.0 | 9.0 |
| 884880 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | MA | 699.0 | w | Individual | 0.0 | 0.0 | 12000.0 | 0.0 | 17.0 | 10.0 |
| 884881 | 110.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | FL | 679.0 | f | Individual | 1.0 | 0.0 | 20000.0 | 0.0 | 8.0 | 12.0 |
442567 rows × 42 columns
comprobamos los valores missing que tienen las variables categoricas
missing[list_var_discrets]
verification_status 0.0 home_ownership 0.0 grade 0.0 loan_status 0.0 purpose 0.0 addr_state 0.0 initial_list_status 0.0 application_type 0.0 dtype: float64
Como no tienen, los dejamos como están, hacemos lo mismo con las variables continuas y las enteras
missing[list_var_int]
mths_since_last_record 0.812372 il_util 0.951273 mths_since_recent_bc_dlq 0.738062 mths_since_rcnt_il 0.945267 all_util 0.944006 inq_fi 0.943999 max_bal_bc 0.943999 total_cu_tl 0.943999 open_acc_6m 0.943999 open_act_il 0.943999 open_il_12m 0.943999 total_bal_il 0.943999 open_rv_12m 0.943999 open_rv_24m 0.943999 open_il_24m 0.943999 inq_last_12m 0.943999 mths_since_last_major_derog 0.706553 mths_since_recent_revol_delinq 0.640341 mths_since_last_delinq 0.487339 emp_length 0.053944 mo_sin_old_il_acct 0.028651 sub_grade 0.000000 term 0.000000 total_acc 0.000000 fico_range_low 0.000000 fico_range_high 0.000000 pub_rec 0.000000 pub_rec_bankruptcies 0.000000 loan_amnt 0.000000 mort_acc 0.000000 mo_sin_old_rev_tl_op 0.000000 open_acc 0.000000 earliest_cr_line_month 0.000000 earliest_cr_line_year 0.000000 dtype: float64
missing[list_var_float]
revol_util 0.000484 dti 0.000041 annual_inc 0.000000 int_rate 0.000000 installment 0.000000 dtype: float64
Primero haremos una imputación manual sobre las variables enteras que lo necesiten, transformándolas posteriormente a enteros.
Para las variables:
mths_since_rcnt_il,
mths_since_last_record,
mths_since_recent_bc_dlq,
mths_since_last_major_derog,
mths_since_last_delinq,
mo_sin_old_il_acct,
mths_since_recent_revol_delinq.
Escogeremos asignar a los nan datos muy elevados ya que suelen ser valores de cantidad de meses sin haber cometido un delito o abrirse cuentas, así que si no pasa podemos asignarles valores elevados a la hora de hacer cálculos y lo interpretará como que llevan medio siglo sin cometer delitos o abrirse cuentas.
# "Months since ..." style counters: a missing value means the event never
# happened, so impute a large sentinel (500 months) rather than a mean.
lista = ['mths_since_last_record',
         'mths_since_recent_bc_dlq',
         'mths_since_last_major_derog',
         'mths_since_last_delinq',
         'mo_sin_old_il_acct',
         'mths_since_rcnt_il',
         'mths_since_recent_revol_delinq']
# Integer columns still pending imputation after this step.
listremove = [c for c in list_var_int if c not in lista]
pd_loan[lista] = pd_loan[lista].fillna(500)
C:\Users\lolo\AppData\Local\Temp/ipykernel_8696/2272314355.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Para las variables:
open_act_il,
open_acc_6m,
open_il_12m,
open_rv_12m,
open_il_24m,
open_rv_24m,
total_cu_tl,
inq_fi,
inq_last_12m,
emp_length.
Son datos sobre cuentas abiertas, si no tiene datos lo dejaremos a 0, es decir supondremos que no se han abierto una cuenta
# Account-opening counters: a missing value is taken to mean "no accounts
# opened", so fill with 0.
lista = ['open_acc_6m',
         'open_il_12m',
         'open_rv_12m',
         'open_il_24m',
         'open_rv_24m',
         'open_act_il',
         'total_cu_tl',
         'inq_fi',
         'inq_last_12m',
         'emp_length']
# Keep only the columns not yet imputed.
listremove = [c for c in listremove if c not in lista]
pd_loan[lista] = pd_loan[lista].fillna(0)
C:\Users\lolo\AppData\Local\Temp/ipykernel_8696/859673671.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Las demás variables las imputaremos.
# Mean-impute whatever numeric columns still contain NaN.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform is equivalent to fit() followed by transform().
pd_loan[list_var_continuous] = imputer.fit_transform(pd_loan[list_var_continuous])
C:\Users\lolo\AppData\Local\Temp/ipykernel_8696/1696139643.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Transformamos las variables enteras en int.
pd_loan[list_var_int]=pd_loan[list_var_int].astype(int)
C:\Users\lolo\AppData\Local\Temp/ipykernel_8696/4174253711.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Codificamos la variables categóricas
# Encode the target: 1 = Fully Paid (paid), 0 = Charged Off (did not pay).
y = pd_loan['loan_status'].replace({'Charged Off': 0, 'Fully Paid': 1})
pd_loan = pd_loan.drop('loan_status', axis=1)
list_var_discrets.remove('loan_status')

# Integer-encode the remaining categorical columns (+1 so codes start at 1).
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
for col in list_var_discrets:
    pd_loan[col] = enc.fit_transform(pd_loan[col]) + 1
Normalizamos los datos
# Standardize the continuous columns to zero mean and unit variance
# (scale() defaults: axis=0, with_mean=True, with_std=True, copy=True).
from sklearn import preprocessing

scaler = preprocessing.scale(pd_loan[list_var_continuous])
pd_loan[list_var_continuous] = scaler
Haremos un balanceo de la variable objetivo que tal y como vimos estaba muy desbalanceada, dado que tenemos una gran cantidad de datos haremos un undersampling a la clase mayoritaria para que el ordenador pueda manejarlo.
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics.pairwise import euclidean_distances
#samply = RandomOverSampler(sampling_strategy='minority',random_state = 0)
# Undersample the majority class so both classes end up the same size.
samply = RandomUnderSampler(random_state = 0)
X , y = samply.fit_resample(pd_loan,y)
Generamos los conjuntos de entrenamiento y de test:
# Hold out a test set (sklearn default 25%); fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)
def todas_metricas(X_train, X_test, y_pred, y_test):
    """Plot the confusion matrix and ROC curve for a set of predictions and
    print the usual classification metrics; return the confusion matrix.

    NOTE(review): X_train and X_test are accepted but never used.  y_pred
    holds hard 0/1 labels, so roc_curve produces a single-threshold
    (degenerate) ROC rather than a full probability-based curve.
    """
    from sklearn import metrics
    from sklearn.metrics import roc_curve, auc
    from sklearn.metrics import confusion_matrix
    confmat = confusion_matrix(y_test,y_pred)  # confusion matrix
    funciones_auxiliares.plot_confusion_matrix(confmat)
    ejex, ejey, _ = roc_curve(y_test, y_pred)  # ROC curve
    roc_auc = auc(ejex, ejey)
    plt.figure()
    plt.plot(ejex, ejey, color='darkorange',lw=2, label='AUC = %0.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], color=(0.6, 0.6, 0.6), linestyle='--')
    plt.plot([0, 0, 1],[0, 1, 1],lw=2, linestyle=':',color='black',label='Clasificador perfecto')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('FPR (1-ESP)')
    plt.ylabel('SEN')
    plt.legend(loc="lower right")
    plt.show()
    funciones_auxiliares.calcula_metricas(confmat)  # accuracy, precision, TNR, recall, F-score
    return confmat
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline; C=1 is the default regularization strength.
clf = LogisticRegression(random_state=0, C=1)
model_glm = clf.fit(X_train, y_train)
y_pred = model_glm.predict_proba(X_test)
# Threshold the class-0 probability at 0.5 to obtain hard labels
# (1 = paid, 0 = charged off).
y_pred_def = [1 if probs[0] < 0.5 else 0 for probs in y_pred]
y_pred_reg = y_pred_def
C:\Users\lolo\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
reg_mat=todas_metricas(X_train, X_test, y_pred_reg, y_test)
ACC: 0.6624969087461875 SEN: 0.6734012287139736 ESP: 0.6516006757035144 PPV: 0.6588671938034533 FSC: 0.6660549336269651
Usaremos solo las variables continuas por ser muy costoso computacionalmente.
def which_k(X_train,X_test,y_train,y_pred,a,b):
    """Fit a KNN classifier for every k in [a, b) and return a dict mapping
    k to its test accuracy (each k and accuracy is also printed).

    NOTE(review): y_test is read from the enclosing (global) scope, and the
    y_pred parameter is never read — it is overwritten inside the loop.
    Passing y_test explicitly would make the function self-contained.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score
    ks = list(range(a,b))
    accs={}
    for k in ks:
        print(k)
        # Define the model for this hyper-parameter value
        knn = KNeighborsClassifier(n_neighbors=k)
        # Fit on the training data
        knn.fit(X_train, y_train)
        # Predict on the test data
        y_pred = knn.predict(X_test)
        # Record the metric of interest (accuracy in this case)
        acc =accuracy_score(y_test, y_pred)
        print(acc)
        accs[k] = acc
    return accs
Comentamos la función por ser computacionalmente costosa pero abajo ponemos nuestros resultados.
#which_k(X_train[list_var_continuous],X_test[list_var_continuous],y_train,y_pred,9,17)
9
0.6224548676943368
10
0.6203115983843047
11
0.6241859698293628
12
0.6238768444481082
13
0.6275657406644135
14
0.6276069573819141
15
0.6305127359657077
16
0.6304509108894568
150
0.6494312092984915
300
0.6504822355947573
1000
0.6505646690297585
1001
0.6502555436485038
2000
0.6497815513972467
Explorar es computacionalmente costoso y no poseemos ordenadores lo suficientemente potentes, por lo que nos quedamos con el que mejor accuracy obtiene de las pruebas que hemos hecho: k=1000.
from sklearn.neighbors import KNeighborsClassifier

# Final KNN model with the best k found above (k=1000), trained on the
# continuous variables only to keep the cost manageable.
knn = KNeighborsClassifier(n_neighbors=1000).fit(X_train[list_var_continuous], y_train)
y_pred_knn = knn.predict(X_test[list_var_continuous])
knn_mat = todas_metricas(X_train[list_var_continuous], X_test[list_var_continuous],
                         y_pred_knn, y_test)
ACC: 0.6505646690297585 SEN: 0.6909248340411496 ESP: 0.6102344361583784 PPV: 0.6391654270130068 FSC: 0.6640380424014266
from sklearn.tree import DecisionTreeClassifier
def which_k_tree(X_train,X_test,y_train,y_pred,a,b):
    """Fit a decision tree for every max_depth in [a, b) and return a dict
    mapping depth to its test accuracy (each depth and accuracy printed).

    NOTE(review): y_test and list_var_continuous are read from the global
    scope, and the y_pred parameter is overwritten without being read.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import accuracy_score
    ks = list(range(a,b))
    accs={}
    for k in ks:
        print(k)
        # Define the model for this hyper-parameter value
        arbol = DecisionTreeClassifier(max_depth=k)
        # Fit on the training data (continuous variables only)
        arbol.fit(X_train[list_var_continuous],y_train)
        # Predict on the test data
        y_pred= arbol.predict(X_test[list_var_continuous])
        # Record the metric of interest (accuracy in this case)
        acc =accuracy_score(y_test, y_pred)
        print(acc)
        accs[k] = acc
    return accs
which_k_tree(X_train,X_test,y_train,y_pred,1,20)
1 0.6271123567719067 2 0.6271123567719067 3 0.6329033055807436 4 0.6446088533509191 5 0.6445676366334185 6 0.6468757728134531 7 0.650337977083505 8 0.6492869507872393 9 0.6496991179622454 10 0.6468963811722035 11 0.6438257357184074 12 0.6408581320583628 13 0.6357060423707855 14 0.629729618333196 15 0.6238768444481082 16 0.6203734234605556 17 0.6128101557991922 18 0.607802324622867 19 0.6013725166927706
{1: 0.6271123567719067,
2: 0.6271123567719067,
3: 0.6329033055807436,
4: 0.6446088533509191,
5: 0.6445676366334185,
6: 0.6468757728134531,
7: 0.650337977083505,
8: 0.6492869507872393,
9: 0.6496991179622454,
10: 0.6468963811722035,
11: 0.6438257357184074,
12: 0.6408581320583628,
13: 0.6357060423707855,
14: 0.629729618333196,
15: 0.6238768444481082,
16: 0.6203734234605556,
17: 0.6128101557991922,
18: 0.607802324622867,
19: 0.6013725166927706}
Observamos que k=7 parece ser el mejor modelo
# Final tree with the best depth found above (max_depth=7), this time
# fitted on all features (fit returns the estimator itself).
arbol = DecisionTreeClassifier(max_depth=7).fit(X_train, y_train)
DecisionTreeClassifier(max_depth=7)
# Predict on the test set and report all metrics for the tree.
y_pred_tree= arbol.predict(X_test)
tree_mat=todas_metricas(X_train, X_test,y_pred_tree, y_test)
ACC: 0.6511210947160169 SEN: 0.5997196223147652 ESP: 0.7024844464587368 PPV: 0.6682440503537628 FSC: 0.6321302070883765
La accuracy y el F-Score resumen con un solo dato lo bueno que es el modelo y vienen bien para hacer un análisis poco exhaustivo, pero se quedan bastante cortos. La curva ROC puede estar bien, pero la que mejor resume lo bueno que es el modelo es la matriz de confusión, que indica en qué acierta y en qué falla, aunque es difícil de comparar con otros modelos.
Ahora ordenaremos por orden los valores de las métricas
Por orden es:
Regresión con:0.6624969087461875
Decision_tree con:0.6511210947160169
knn con:0.6505646690297585
Por orden es:
Regresión con: 0.6660549336269651
knn con:0.6511210947160169
Decision_tree con: 0.6321302070883765
Por orden es:
knn con:0.6909248340411496
Regresión con: 0.6734012287139736
Decision_tree con:0.5997196223147652
Por orden es:
Decision_tree con:0.7024844464587368
Regresión con: 0.6516006757035144
knn con:0.6102344361583784
Por orden es:
Decision_tree con:0.6682440503537628
Regresión con: 0.6516006757035144
knn con:0.6588671938034533
Por orden es:
Regresión con: 0.6660549336269651
knn con: 0.6640380424014266
Decision_tree con:0.6321302070883765
# Re-plot the three confusion matrices for a final side-by-side comparison.
funciones_auxiliares.plot_confusion_matrix(reg_mat)
funciones_auxiliares.plot_confusion_matrix(tree_mat)
funciones_auxiliares.plot_confusion_matrix(knn_mat)
Tras todos los análisis concluimos que, pese a que sigue siendo un modelo mejorable por fallar tanto, el mejor es el de regresión logística, por ser el que mejores métricas y mejor matriz de confusión presenta.